import os
import re
import requests
import time
import random
from bs4 import BeautifulSoup


# Function to get the final URL after redirection
def get_final_url(link, sleep_time=0.05, timeout=10):
    """Return the URL that *link* resolves to after following redirects.

    A short, slightly jittered pause is taken before the request so that
    repeated calls do not hit the server back-to-back.

    Args:
        link: URL to resolve.
        sleep_time: Base pause in seconds before the request; up to 10% of
            random jitter is added on top.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The final URL reported by the HEAD request, or *link* unchanged if
        the request fails (including on timeout).
    """
    # The pause cannot raise a request error, so it stays outside the try.
    time.sleep(sleep_time + random.uniform(0, sleep_time / 10))
    try:
        r = requests.head(link, allow_redirects=True, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error getting final URL for {link}: {e}")
        return link
    print(f"Final URL for {link}: {r.url}")
    return r.url


# Function to get content of textarea from a given URL
def get_textarea_content(url, timeout=10):
    """Fetch *url* and return the text of its ``wpTextbox1`` textarea.

    Args:
        url: Page to download (the caller passes a ``?action=edit`` URL).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(content, title)`` tuple. ``content`` is the textarea text and
        ``title`` is a filename-safe name derived from the page ``<title>``.
        ``title`` is ``None`` when the page has no ``<title>`` or nothing
        usable is left after sanitising, so the caller can pick its own
        fallback name. ``(None, None)`` is returned when the request fails
        or the page has no such textarea.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Failed to retrieve content from {url}: {e}")
        return None, None

    soup = BeautifulSoup(response.content, 'html.parser')
    textarea = soup.find('textarea', {'id': 'wpTextbox1'})
    if not textarea:
        return None, None

    title_tag = soup.find('title')
    if title_tag is None:
        # Report "no title" as None instead of a placeholder string: a
        # placeholder would be sanitised into one fixed, truthy name and
        # every untitled page would end up sharing it.
        return textarea.text, None

    title = title_tag.text.replace('View source for', '').strip()
    # Collapse every run of non-alphanumeric characters into a single '_'.
    title = re.sub(r'[^a-zA-Z0-9]+', '_', title)
    # Drop the site-name part only after sanitising: before that step the
    # separators are not underscores yet, so '_Minecraft_Wiki' could only
    # match a title that literally contained underscores.
    title = title.replace('_Minecraft_Wiki', '').lstrip('_')
    return textarea.text, title or None
    

def _collect_wiki_links(container, base_url):
    """Return the resolved, fragment-free wiki URLs listed inside *container*.

    For every ``<li>`` in *container*, only its first ``<a href=...>`` is
    considered, and only when the href starts with ``/wiki``. Each link is
    resolved through ``get_final_url`` and any ``#fragment`` is removed.
    """
    links = set()
    for li in container.find_all('li'):
        a_tag = li.find('a', href=True)
        if a_tag and a_tag['href'].startswith('/wiki'):
            final_link = get_final_url(base_url + a_tag['href'])
            links.add(final_link.split('#', 1)[0])
    return links


def main():
    """Scrape the Tutorials index, save the link list, then save each page's wikitext.

    Outputs:
        wiki_links.txt: one collected tutorial URL per line.
        wiki_raw/<title>.txt: the edit-box text of each fetched page.
        wiki_raw/error_links.txt: edit URLs that yielded no content.
    """
    url = 'https://minecraft.fandom.com'
    suffix = '/wiki/Tutorials'

    # Fetch the index page; fail loudly rather than parse an error page and
    # silently produce an empty link list.
    response = requests.get(url + suffix, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    mw_parser_output_div = soup.find('div', class_='mw-parser-output')

    # Only real headings (h1-h6) start a new section. A plain
    # startswith('h') test would also match tags such as <hr> or <header>
    # and reset the current heading to an empty string.
    heading_tag = re.compile(r'h[1-6]')
    # Zero-based positions (among direct-child divs) that are skipped.
    # NOTE(review): these indices are tied to the page layout at the time
    # the script was written -- confirm against the current page.
    skipped_div_positions = {0, 1, 12, 13}

    # Links grouped by the heading they appear under.
    sorted_links = {}
    current_heading = None
    div_count = 0
    if mw_parser_output_div:
        for element in mw_parser_output_div.children:
            name = element.name
            if not name:
                # Text nodes and the like carry no tag name.
                continue
            if heading_tag.fullmatch(name):
                current_heading = element.get_text(strip=True)
                sorted_links.setdefault(current_heading, set())
            elif name in ('ul', 'div'):
                if name == 'div':
                    position = div_count
                    div_count += 1
                    if position in skipped_div_positions:
                        continue
                # Lists that appear before any (non-empty) heading are ignored.
                if current_heading:
                    sorted_links[current_heading].update(
                        _collect_wiki_links(element, url)
                    )

    # Drop whole sections whose heading contains one of these keywords.
    filter_keywords = ['Servers', 'Technical', 'CreatingMinecraftmedia', 'Bedrock Edition only', 'Other', 'Outdated tutorials']
    filtered_sorted_links = {
        heading: links
        for heading, links in sorted_links.items()
        if not any(keyword in heading for keyword in filter_keywords)
    }
    # Sort within each section so the output order is stable between runs
    # (set iteration order is not).
    filtered_sorted_links_list = [
        link
        for links in filtered_sorted_links.values()
        for link in sorted(links)
    ]

    # Save all the collected links.
    with open('wiki_links.txt', 'w', encoding='utf-8') as links_file:
        links_file.writelines(link + '\n' for link in filtered_sorted_links_list)

    # Read the list back from disk so the second phase works from exactly
    # what was saved.
    with open('wiki_links.txt', 'r', encoding='utf-8') as links_file:
        links = [line.strip() for line in links_file]

    save_dir = 'wiki_raw'
    os.makedirs(save_dir, exist_ok=True)

    no_title_count = 0
    error_links_path = os.path.join(save_dir, 'error_links.txt')
    with open(error_links_path, 'w', encoding='utf-8') as error_links_file:
        for link in links:
            # Skip pages whose URL mentions these keywords.
            if any(key in link for key in ('Custom', 'Edition')):
                continue

            edit_url = f"{link}?action=edit"
            content, title = get_textarea_content(edit_url)
            if content:
                print(f"Content from {edit_url}")
                if title:
                    file_name = f'{title}.txt'
                else:
                    # Untitled pages get a numbered fallback name.
                    file_name = f'no_title_{no_title_count}.txt'
                    no_title_count += 1
                with open(os.path.join(save_dir, file_name), 'w', encoding='utf-8') as out:
                    out.write(content)
            else:
                print(f"No content found at {edit_url}")
                error_links_file.write(f"{edit_url}\n")

            # Sleep to avoid overwhelming the server.
            time.sleep(2 + random.uniform(0, 1))


if __name__ == '__main__':
    main()
